In [1]:
import os
import sys
import time

import feedparser
from pymongo import MongoClient
# RSS sources to crawl, one [name, url] pair per news site.
feeds = [
    ['detik', 'http://detik.feedsportal.com/c/33613/f/656089/index.rss'],
    ['viva', 'http://rss.viva.co.id/get/bisnis'],
    ['merdeka', 'http://www.merdeka.com/feed/'],
    ['liputan6', 'http://www.liputan6.com/feed/rss2'],
    ['tribun', 'http://www.tribunnews.com/rss/bisnis'],
    ['okezone', 'http://sindikasi.okezone.com/index.php/rss/11/RSS2.0'],
    ['jpnn', 'http://www.jpnn.com/index.php?mib=rss&id=216'],
    ['suara', 'http://www.suara.com/rss/bisnis'],
    ['bisniscom', 'http://www.bisnis.com/rss/index?c=382'],
]
In [1]:
# MongoDB connection settings consumed by the connection cell below.
# They are read from environment variables so the notebook runs on a fresh
# kernel without credentials being stored in it; the fallbacks are the values
# that were previously kept here as commented-out assignments.
server = os.environ.get('MONGO_HOST', 'localhost')
port = int(os.environ.get('MONGO_PORT', '27017'))
db_name = os.environ.get('MONGO_DB', 'thosangs-news')
username = os.environ.get('MONGO_USER', 'userSkripsi')
# Deliberately no fallback: the password must be supplied from outside the
# notebook (it is None when MONGO_PASSWORD is not set).
password = os.environ.get('MONGO_PASSWORD')
In [3]:
import pymongo

# Show interpreter and driver versions next to the results for reproducibility.
print('%s %s' % ('Python version', sys.version))
print('%s %s' % ('Pymongo version', pymongo.version))

# Open the client connection, select the database, then log in.
# `server`, `port`, `db_name`, `username` and `password` must already be defined.
print('\nConnecting ...')
conn = MongoClient(server, port)

print('\nGetting database ...')
db = conn[db_name]

# NOTE(review): Database.authenticate is not available in PyMongo 4+ —
# confirm the installed driver version before upgrading.
print('\nAuthenticating ...')
db.authenticate(username, password)
Out[3]:
# Seed the `newsLink` collection with one {name, link} document per feed.
# Expects `feeds` to still be the list of [name, url] pairs.
# NOTE(review): every run inserts the feed documents again, so re-running
# this cell presumably duplicates them unless the collection enforces
# uniqueness — confirm before executing more than once.
for feed in feeds:
    print(feed[1])

# The assignment and the insert were fused on a single line, which is not
# valid Python; they are two separate statements.
pos = db.newsLink
pos.insert_many([{'name': feed[0], 'link': feed[1]} for feed in feeds])
In [4]:
# Rebuild `feeds` from the `newsLink` collection, keeping only the name and
# link of every stored document.
link = db.newsLink.find()
feeds = [{'name': row['name'], 'link': row['link']} for row in link]
In [29]:
# Download every feed and store its entries in `doc`, keyed by feed name.
# One timing line is printed per feed.
doc = dict()
for feed in feeds:
    start_time = time.time()
    entries = feedparser.parse(feed['link'])['entries']
    for entry in entries:
        # Drop the parsed-date field. A default is passed because an entry
        # without a publication date has no such key, and a bare pop() would
        # abort the whole crawl with KeyError.
        entry.pop('published_parsed', None)
    doc[feed['name']] = entries
    print('{0} {1}-News {2}-Seconds'.format(feed['name'], len(entries), (time.time() - start_time)))
In [26]:
# Keep only the entries whose link is not yet stored in the `news` collection.
pos = db.news
for linkBerita in doc:
    unseen = []
    for berita in doc[linkBerita]:
        # find() returns a cursor object, which is truthy even when nothing
        # matches; find_one() returns None for "no match", which can be tested.
        if pos.find_one({'link': berita['link']}) is None:
            print('TIDAK ' + berita['link'])
            unseen.append(berita)
    # Build a new list instead of calling remove() on the list being
    # iterated, which silently skips the element after each removal.
    doc[linkBerita] = unseen
In [31]:
# Scratch check: build a query for the link of the second 'viva' entry and
# print the link of the second 'detik' entry.
viva_link = doc['viva'][1]['link']
dd = pos.find({'link': viva_link})
print(doc['detik'][1]['link'])
In [28]:
# Flatten the per-feed entry lists, in feed order, and store them in one batch.
new_entries = [berita for feed in feeds for berita in doc[feed['name']]]
# insert_many() rejects an empty list, which is the normal situation when
# every fetched entry is already stored, so only call it when there is work.
if new_entries:
    pos.insert_many(new_entries)
In [ ]:
# NOTE(review): this binds the `pos.find` method itself to `d`; the method is
# not called here, so no query is issued. Looks like unfinished scratch code —
# confirm whether it should be completed or removed.
d = pos.find
In [11]:
# Experiment: extract the short description from detik.com feed summaries.
def FindShortDesc(desc, cekawal, cekakhir):
    """Return the part of `desc` found between two marker strings.

    Parameters:
        desc: text to search.
        cekawal: marker that precedes the wanted text; its first occurrence
            is used.
        cekakhir: marker that follows the wanted text; the first occurrence
            located after `cekawal` is used.

    Returns:
        The text between the two markers, or an empty string when either
        marker cannot be found.
    """
    start = desc.find(cekawal)
    if start == -1:
        # Without this guard the -1 from find() would be added to the marker
        # length and yield an unrelated slice.
        return desc[:0]
    start += len(cekawal)
    # Search for the closing marker only after the opening one, so an earlier
    # occurrence of `cekakhir` cannot produce an empty or wrong result.
    end = desc.find(cekakhir, start)
    if end == -1:
        return desc[:0]
    return desc[start:end]
# Print title, short description, image URL and id for each position in `ds`.
# NOTE(review): `ds` is not defined in the visible cells — presumably a table
# of detik.com entries with `title`, `summary` and `id` columns; verify.
for j in range(len(ds)):
    judul = ds.title[j]
    summary = ds.summary[j]
    shortdesc = FindShortDesc(summary, 'width="100" />', '<br c')
    img = FindShortDesc(summary, 'src="', '" width')
    link = ds.id[j]
    print(judul)
    print(shortdesc)
    print(img)
    # The extra newline leaves a blank line between consecutive entries.
    print(link + '\n')
In [6]:
# Show the text found between 'src="' and '" width' in the first summary.
first_summary = ds.summary[0]
print(FindShortDesc(first_summary, 'src="', '" width'))